pacman::p_load(
tidyverse,
here,
RColorBrewer,
lubridate,
scales,
GGally,
stats,
corrplot,
mice,
VIM
)
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))
Rows: 327346 Columns: 38
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): origin, origin_name, dest, dest_name, carrier, carrier_name, tailnum, manufacturer, model, engine, type
dbl (22): dep_delay, arr_delay, air_time, distance, flight, engines, seats, aircraft_age, lat, lon, alt, wind_di...
dttm (5): dep_time, sched_dep_time, arr_time, sched_arr_time, time_hour
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
summary(flights_dt)
dep_time sched_dep_time dep_delay origin origin_name
Min. :2013-01-01 05:17:00 Min. :2013-01-01 05:15:00 Min. : -43.00 Length:327346 Length:327346
1st Qu.:2013-04-05 05:56:00 1st Qu.:2013-04-05 06:00:00 1st Qu.: -5.00 Class :character Class :character
Median :2013-07-04 09:52:30 Median :2013-07-04 09:54:30 Median : -2.00 Mode :character Mode :character
Mean :2013-07-03 18:10:05 Mean :2013-07-03 18:02:48 Mean : 12.56
3rd Qu.:2013-10-01 18:12:45 3rd Qu.:2013-10-01 18:14:45 3rd Qu.: 11.00
Max. :2013-12-31 23:56:00 Max. :2013-12-31 23:59:00 Max. :1301.00
arr_time sched_arr_time arr_delay dest
Min. :2013-01-01 00:03:00 Min. :2013-01-01 00:05:00 Min. : -86.000 Length:327346
1st Qu.:2013-04-05 01:03:00 1st Qu.:2013-04-05 03:43:30 1st Qu.: -17.000 Class :character
Median :2013-07-04 11:36:30 Median :2013-07-04 11:55:30 Median : -5.000 Mode :character
Mean :2013-07-03 19:41:02 Mean :2013-07-03 19:59:24 Mean : 6.895
3rd Qu.:2013-10-01 20:07:00 3rd Qu.:2013-10-01 20:20:30 3rd Qu.: 14.000
Max. :2014-01-01 00:00:00 Max. :2013-12-31 23:59:00 Max. :1272.000
dest_name air_time distance flight carrier carrier_name
Length:327346 Min. : 20.0 Min. : 80 Min. : 1 Length:327346 Length:327346
Class :character 1st Qu.: 82.0 1st Qu.: 509 1st Qu.: 544 Class :character Class :character
Mode :character Median :129.0 Median : 888 Median :1467 Mode :character Mode :character
Mean :150.7 Mean :1048 Mean :1943
3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:3412
Max. :695.0 Max. :4983 Max. :8500
tailnum manufacturer model engines seats engine
Length:327346 Length:327346 Length:327346 Min. :1.00 Min. : 2.0 Length:327346
Class :character Class :character Class :character 1st Qu.:2.00 1st Qu.: 55.0 Class :character
Mode :character Mode :character Mode :character Median :2.00 Median :149.0 Mode :character
Mean :1.99 Mean :137.5
3rd Qu.:2.00 3rd Qu.:189.0
Max. :4.00 Max. :450.0
NA's :48329 NA's :48329
aircraft_age lat lon alt time_hour wind_dir
Min. : 8.00 Min. :21.32 Min. :-157.92 Min. : 3.0 Min. :2013-01-01 10:00:00 Min. : 0.0
1st Qu.:15.00 1st Qu.:32.90 1st Qu.: -95.34 1st Qu.: 26.0 1st Qu.:2013-04-05 10:00:00 1st Qu.:130.0
Median :19.00 Median :36.08 Median : -83.99 Median : 433.0 Median :2013-07-04 13:00:00 Median :220.0
Mean :19.59 Mean :35.97 Mean : -89.61 Mean : 583.8 Mean :2013-07-03 21:56:45 Mean :201.9
3rd Qu.:22.00 3rd Qu.:41.41 3rd Qu.: -80.15 3rd Qu.: 748.0 3rd Qu.:2013-10-01 22:00:00 3rd Qu.:290.0
Max. :65.00 Max. :61.17 Max. : -68.83 Max. :6602.0 Max. :2014-01-01 04:00:00 Max. :360.0
NA's :53493 NA's :7537 NA's :7537 NA's :7537 NA's :9574
humid hour minute year type temp
Min. : 12.74 Min. : 5.00 Min. : 0.00 Min. :1956 Length:327346 Min. : 10.94
1st Qu.: 43.74 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:1999 Class :character 1st Qu.: 42.08
Median : 57.22 Median :13.00 Median :29.00 Median :2002 Mode :character Median : 57.20
Mean : 59.21 Mean :13.14 Mean :26.23 Mean :2001 Mean : 57.01
3rd Qu.: 74.67 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2006 3rd Qu.: 71.96
Max. :100.00 Max. :23.00 Max. :59.00 Max. :2013 Max. :100.04
NA's :1544 NA's :53493 NA's :1544
dewp wind_speed precip pressure visib
Min. :-9.94 Min. : 0.000 Min. :0.0000 Min. : 983.8 Min. : 0.00
1st Qu.:26.06 1st Qu.: 6.905 1st Qu.:0.0000 1st Qu.:1012.9 1st Qu.:10.00
Median :42.80 Median :10.357 Median :0.0000 Median :1017.6 Median :10.00
Mean :41.50 Mean :11.060 Mean :0.0042 Mean :1017.9 Mean : 9.29
3rd Qu.:57.92 3rd Qu.:14.960 3rd Qu.:0.0000 3rd Qu.:1022.9 3rd Qu.:10.00
Max. :78.08 Max. :42.579 Max. :1.2100 Max. :1042.1 Max. :10.00
NA's :1544 NA's :1605 NA's :1527 NA's :36142 NA's :1527
Which airport in our dataset has the highest number of departures?
flights_dt %>%
group_by(origin) %>%
ggplot() +
aes(x = origin, fill = origin) +
geom_bar() +
scale_fill_manual(values = cbPalette) +
labs(
x = "airport",
y = "departure numbers",
title = "Departure numbers by airport"
) +
guides(fill = "none") +
theme_bw()

What is the trend of delays over the year, and how does it compare across the three airports?
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = month, y = mean_delay, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "mean delay (minutes)",
title = " Mean delay by month",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()

What is the number of departure delays over the year?
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(no_of_delays = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_delays, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "number of delays",
title = " Monthly trend of departure delays",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()

What is the number of flights per month?
flights_dt %>%
mutate(month = month(sched_dep_time, label = TRUE)) %>%
group_by(month, origin) %>%
summarise(no_of_flights = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_flights, fill = origin) +
geom_col(position = "dodge", alpha = 0.8) +
labs(
x = "month",
y = "number of departures",
title = "Departure numbers by month",
fill = "airport"
) +
scale_fill_manual(values = cbPalette) +
theme_bw()

Which carrier has the most departure delays?
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean departure delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()

Which carrier has the most arrival delays?
flights_dt %>%
filter(arr_delay > 0) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(arr_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean arrival delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()

Which season has the highest number of departure delays?
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = day, y = mean_delay, colour = origin) +
geom_point(alpha = 0.8) +
labs(
x = "date",
y = "mean delay (minutes)",
title = "Mean departure delay by month"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()

Over the course of a day, when is the average delay largest?
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, hour) %>%
group_by(hour) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = hour, y = mean_delay) +
geom_point(alpha = 0.8, colour = "#56B4E9") +
geom_smooth(se = FALSE, colour = "#E69F00") +
labs(
x = "hour",
y = "mean delay (minutes)",
title = "Mean departure delay by departure time"
) +
theme_bw() +
scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

How does distance of an outbound flight affect departure delays?
flights_dt %>%
filter(origin == "EWR") %>%
ggplot() +
aes(x = distance) +
geom_histogram(bins = 50) +
theme_bw() +
labs(title = "Distribution of flight distance")

positions <- c("<500mi", "500-1000mi", "1000-1500mi", "1500-2000mi", "2000-2500mi", "2500-3000mi",
">3000mi")
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
mutate(distance = case_when(
distance <= 500 ~ "<500mi",
distance > 500 & distance <= 1000 ~ "500-1000mi",
distance > 1000 & distance <= 1500 ~ "1000-1500mi",
distance > 1500 & distance <= 2000 ~ "1500-2000mi",
distance > 2000 & distance <= 2500 ~ "2000-2500mi",
distance > 2500 & distance <= 3000 ~ "2500-3000mi",
distance > 3000 ~ ">3000mi"
)) %>%
group_by(distance) %>%
summarise(mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = distance, y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "distance (miles)",
y = "mean departure delay (minutes)",
title = "Departure delay by flight distance"
) +
scale_x_discrete(limits = positions) +
theme_bw()

Which day is the best day to travel from Newark Int.?
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
mutate(weekday = wday(sched_dep_time, label = TRUE)) %>%
group_by(weekday) %>%
summarise(mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = weekday, y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "weekday",
y = "mean departure delay (minutes)",
title = "Departure delay by weekday"
) +
theme_bw()

Which are the most popular destinations from Newark Int.?
flights_dt %>%
filter(origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, count), y = count) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "number of flights per year",
title = "Destinations from Newark Int."
) +
theme_bw() +
coord_flip()

Which destinations suffer from the longest delays?
flights_dt %>%
drop_na(dest_name) %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_delay)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "mean delay (minutes)",
title = "Destinations with longest delays"
) +
theme_bw() +
coord_flip()

Trend of weather conditions against mean departure delays
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_visibility = mean(visib, na.rm = TRUE),
mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = mean_visibility, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean visibility (miles)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by visibility"
) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_speed, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind speed (mph)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind speed"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_dir, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind direction (degrees)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind direction"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_humidity = mean(humid, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_humidity, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean humidity (%)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by humidity"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_temp = mean(temp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_temp, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean temperature (degf)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by temperature"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_dewpoint, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean dewpoint (degF)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by dewpoint"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_precip = mean(precip, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_precip, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean precipitation (inches)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by precipitation"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_pressure = mean(pressure, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_pressure, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean pressure (mbar)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by pressure"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).

---
title: "Exploratory Analysis"
output: html_notebook
---

```{r, warning=FALSE,message=FALSE}
pacman::p_load(
  tidyverse,
  here,
  RColorBrewer,
  lubridate,
  scales,
  GGally,
  stats,
  corrplot,
  leaps,
  glmulti,
  broom,
  rpart,
  rpart.plot,
  modelr,
  yardstick,
  caret,
  ranger
)
```

```{r, warning=FALSE,message=FALSE}
# Read the cleaned flights data from the project-relative clean_data folder.
flights_dt <- here("clean_data/flights_clean.csv") %>%
  read_csv()

# Eight-colour palette shared by the manual colour/fill scales in later chunks.
cbPalette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Per-column overview (ranges, quartiles, NA counts) as a first sanity check.
summary(flights_dt)
```

# Which airport in our dataset has the highest number of departures?

```{r}
# Bar chart of the number of departures per origin airport.
# geom_bar() counts the rows for each x value itself, so the data is passed
# straight to ggplot() without any grouping or pre-aggregation.
flights_dt %>%
  ggplot() +
  aes(x = origin, fill = origin) +
  geom_bar() +
  scale_fill_manual(values = cbPalette) +
  labs(
    x = "airport",
    y = "departure numbers",
    title = "Departure numbers by airport"
  ) +
  # The x axis already names each airport, so the fill legend is redundant.
  guides(fill = "none") +
  theme_bw()
```

# What is the trend of delays over the year, and how does it compare across the three airports?

```{r,warning=FALSE}
# Monthly mean departure delay per origin airport.
# Only delayed flights (dep_delay > 0) are kept, so the mean is the average
# length of a delay, not the average over all departures.
flights_dt %>%
  filter(dep_delay > 0) %>%
  # floor_date() collapses every scheduled departure to the first of its month.
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
  ggplot() +
  aes(x = month, y = mean_delay, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "mean delay (minutes)",
    title = "Mean delay by month",
    colour = "airport"
  ) +
  # Shared palette; only as many colours as there are airports are used.
  scale_colour_manual(values = cbPalette) +
  theme_bw()
```

# What is the number of departure delays over the year?

```{r}
# Monthly count of delayed departures (dep_delay > 0) per origin airport.
flights_dt %>%
  filter(dep_delay > 0) %>%
  # floor_date() collapses every scheduled departure to the first of its month.
  group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
  summarise(no_of_delays = n(), .groups = "drop") %>%
  ggplot() +
  aes(x = month, y = no_of_delays, colour = origin) +
  geom_line(alpha = 0.6) +
  geom_point(alpha = 0.8) +
  labs(
    x = "month",
    y = "number of delays",
    title = "Monthly trend of departure delays",
    colour = "airport"
  ) +
  # Shared palette; only as many colours as there are airports are used.
  scale_colour_manual(values = cbPalette) +
  theme_bw()
```

# What is the number of flights per month?

```{r}
# Number of departures per calendar month and origin airport, drawn as
# side-by-side bars. count() tallies rows per (month, origin) pair.
flights_dt %>%
  count(
    month = month(sched_dep_time, label = TRUE),
    origin,
    name = "no_of_flights"
  ) %>%
  ggplot(aes(x = month, y = no_of_flights, fill = origin)) +
  geom_col(position = "dodge", alpha = 0.8) +
  scale_fill_manual(values = cbPalette) +
  theme_bw() +
  labs(
    title = "Departure numbers by month",
    x = "month",
    y = "number of departures",
    fill = "airport"
  )
```

# Which carrier has the most departure delays?

```{r}
# Mean departure delay per carrier, one facet per origin airport.
# Only delayed flights (dep_delay > 0) contribute to the mean.
flights_dt %>%
  filter(dep_delay > 0) %>%
  group_by(carrier_name, origin) %>%
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
  ggplot() +
  # reorder() sorts carriers by their mean_delay averaged over the airports,
  # so every facet shares the same carrier order.
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean departure delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  # Shared palette; only as many colours as there are airports are used.
  scale_fill_manual(values = cbPalette) +
  # Flip so the carrier names read horizontally. Axis text keeps the theme
  # defaults so the numeric tick labels stay centred on their ticks.
  coord_flip()
```

# Which carrier has the most arrival delays?

```{r}
# Mean arrival delay per carrier, one facet per origin airport.
# Only late arrivals (arr_delay > 0) contribute to the mean.
flights_dt %>%
  filter(arr_delay > 0) %>%
  group_by(carrier_name, origin) %>%
  summarise(mean_delay = mean(arr_delay), .groups = "drop") %>%
  ggplot() +
  # reorder() sorts carriers by their mean_delay averaged over the airports,
  # so every facet shares the same carrier order.
  aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
  geom_col(alpha = 0.8) +
  facet_wrap(~ origin) +
  labs(
    x = "carrier",
    y = "mean delay (minutes)",
    title = "Mean arrival delay by carrier",
    fill = "airport"
  ) +
  theme_bw() +
  # Shared palette; only as many colours as there are airports are used.
  scale_fill_manual(values = cbPalette) +
  # Flip so the carrier names read horizontally. Axis text keeps the theme
  # defaults so the numeric tick labels stay centred on their ticks.
  coord_flip()
```

# Which season has the highest number of departure delays?

```{r}
# Daily mean departure delay per origin airport, delayed flights only.
# Plotting one point per day shows how delays move through the seasons.
flights_dt %>%
  filter(dep_delay > 0) %>%
  # floor_date(..., "day") truncates each scheduled departure to its date.
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
  ggplot() +
  aes(x = day, y = mean_delay, colour = origin) +
  geom_point(alpha = 0.8) +
  labs(
    x = "date",
    y = "mean delay (minutes)",
    # Data is aggregated per day, so the title says so.
    title = "Mean departure delay by day",
    colour = "airport"
  ) +
  scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
  theme_bw()
```

# Over the course of a day, when is the average delay largest?

```{r}
# Mean departure delay of delayed EWR flights for each value of `hour`,
# with a smoothed trend line through the hourly means.
flights_dt %>%
  # The filter also drops rows with a missing dep_delay, so mean() below
  # needs no na.rm.
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(hour) %>%
  summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
  ggplot() +
  aes(x = hour, y = mean_delay) +
  geom_point(alpha = 0.8, colour = "#56B4E9") +
  # Name the smoother explicitly (the one ggplot2 picks by default here) so
  # the chunk renders without the "using method = 'loess'" message.
  geom_smooth(
    method = "loess",
    formula = y ~ x,
    se = FALSE,
    colour = "#E69F00"
  ) +
  labs(
    x = "hour",
    y = "mean delay (minutes)",
    title = "Mean departure delay by departure time"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
```
# How does distance of an outbound flight affect departure delays?

```{r}
# Histogram (50 bins) of flight distance for departures from EWR.
flights_dt %>%
  filter(origin == "EWR") %>%
  ggplot(aes(x = distance)) +
  geom_histogram(bins = 50) +
  labs(title = "Distribution of flight distance") +
  theme_bw()
```

```{r}
# Labels of the 500-mile distance bands, in ascending order. Used both as the
# bin labels and as the x-axis order of the plot.
positions <- c(
  "<500mi", "500-1000mi", "1000-1500mi", "1500-2000mi",
  "2000-2500mi", "2500-3000mi", ">3000mi"
)

# Mean departure delay of delayed EWR flights per distance band.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  # cut() builds right-closed intervals: (-Inf, 500], (500, 1000], ...,
  # (3000, Inf]. The band goes into its own column so the numeric `distance`
  # is not overwritten.
  mutate(distance_band = cut(
    distance,
    breaks = c(-Inf, seq(500, 3000, by = 500), Inf),
    labels = positions
  )) %>%
  group_by(distance_band) %>%
  summarise(mean_delay = mean(dep_delay)) %>%
  ggplot() +
  aes(x = distance_band, y = mean_delay) +
  geom_col(fill = "#999999", alpha = 0.8) +
  labs(
    x = "distance (miles)",
    y = "mean departure delay (minutes)",
    title = "Departure delay by flight distance"
  ) +
  # Fix the axis to all bands in ascending order, including empty ones.
  scale_x_discrete(limits = positions) +
  theme_bw()
```

# Which day is the best day to travel from Newark Int.?

```{r}
# Mean departure delay of delayed EWR flights for each day of the week.
flights_dt %>%
  filter(origin == "EWR", dep_delay > 0) %>%
  # wday(label = TRUE) turns the scheduled departure into a weekday label.
  group_by(weekday = wday(sched_dep_time, label = TRUE)) %>%
  summarise(mean_delay = mean(dep_delay)) %>%
  ggplot(aes(x = weekday, y = mean_delay)) +
  geom_col(fill = "#999999", alpha = 0.8) +
  theme_bw() +
  labs(
    title = "Departure delay by weekday",
    x = "weekday",
    y = "mean departure delay (minutes)"
  )
```

# Which are the most popular destinations from Newark Int.?

```{r}
# Ten destinations with the most flights out of EWR, as horizontal bars
# ordered by flight count.
flights_dt %>%
  filter(origin == "EWR") %>%
  # One row per destination with its number of flights.
  count(dest, dest_name, name = "count") %>%
  arrange(desc(count)) %>%
  head(n = 10) %>%
  ggplot(aes(x = reorder(dest_name, count), y = count)) +
  geom_col(fill = "#999999", alpha = 0.8) +
  coord_flip() +
  theme_bw() +
  labs(
    title = "Destinations from Newark Int.",
    x = "destination",
    y = "number of flights per year"
  )
```

# Which destinations suffer from the longest delays?

```{r}
# Ten destinations with the highest mean departure delay among delayed EWR
# flights, as horizontal bars ordered by that mean.
flights_dt %>%
  # Rows without a destination name are excluded.
  filter(!is.na(dest_name)) %>%
  filter(origin == "EWR", dep_delay > 0) %>%
  group_by(dest, dest_name) %>%
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_delay)) %>%
  head(n = 10) %>%
  ggplot(aes(x = reorder(dest_name, mean_delay), y = mean_delay)) +
  geom_col(fill = "#999999", alpha = 0.8) +
  coord_flip() +
  theme_bw() +
  labs(
    title = "Destinations with longest delays",
    x = "destination",
    y = "mean delay (minutes)"
  )
```

# Trend of weather conditions against mean departure delays

```{r}
# Daily mean visibility against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(origin == "EWR", dep_delay > 0) %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_visibility = mean(visib, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = mean_visibility, y = mean_delay)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  theme_bw() +
  labs(
    title = "Mean departure delay by visibility",
    x = "mean visibility (miles)",
    y = "mean departure delay (minutes)"
  )
```

```{r}
# Daily mean wind speed against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_wind_speed = mean(wind_speed, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_wind_speed, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind speed (mph)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind speed"
  ) +
  theme_bw()
```

```{r}
# Daily mean wind direction against daily mean departure delay for delayed
# EWR flights, with a straight-line (lm) fit.
# NOTE(review): wind_dir is an angle (0-360 degrees); an arithmetic mean of
# angles can mislead near the 0/360 wrap-around. Consider a circular mean.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_wind_dir = mean(wind_dir, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_wind_dir, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean wind direction (degrees)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by wind direction"
  ) +
  theme_bw()
```

```{r}
# Daily mean humidity against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_humidity = mean(humid, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_humidity, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean humidity (%)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by humidity"
  ) +
  theme_bw()
```

```{r}
# Daily mean temperature against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_temp = mean(temp, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_temp, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    # Unit spelled "degF", matching the dewpoint plot's axis label.
    x = "mean temperature (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by temperature"
  ) +
  theme_bw()
```

```{r}
# Daily mean dewpoint against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_dewpoint = mean(dewp, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_dewpoint, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean dewpoint (degF)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by dewpoint"
  ) +
  theme_bw()
```

```{r}
# Daily mean precipitation against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_precip = mean(precip, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_precip, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean precipitation (inches)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by precipitation"
  ) +
  theme_bw()
```

```{r}
# Daily mean pressure against daily mean departure delay for delayed EWR
# flights, with a straight-line (lm) fit.
flights_dt %>%
  filter(dep_delay > 0 & origin == "EWR") %>%
  group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
  summarise(
    mean_pressure = mean(pressure, na.rm = TRUE),
    mean_delay = mean(dep_delay),
    # Return an ungrouped result; this also avoids the
    # "grouped output by 'day'" message.
    .groups = "drop"
  ) %>%
  ggplot() +
  aes(x = mean_pressure, y = mean_delay) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
  labs(
    x = "mean pressure (mbar)",
    y = "mean departure delay (minutes)",
    title = "Mean departure delay by pressure"
  ) +
  theme_bw()
```



